library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(ggplot2)
library(foreach)
library(doSNOW)
Loading required package: iterators
Loading required package: snow
library(plotly)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(latex2exp)
Attaching package: ‘latex2exp’
The following object is masked from ‘package:plotly’:
TeX
# Index of dispersion: the biased (maximum-likelihood) sample variance
# divided by the sample mean.
e_culc <- function(samp){
  n_obs <- length(samp)
  biased_var <- var(samp) * (n_obs - 1) / n_obs
  biased_var / mean(samp)
}
Табличная функция распределения логарифмического распределения:
# Cumulative distribution table of the logarithmic distribution:
# P(X = k) = -p^k / (k * log(1 - p)), k = 1, 2, ...
# Terms are added until the current probability drops below 1e-10;
# table[j] holds P(X <= j).
log_prepering <- function(p = 0.5){
  term <- (-p) / log(1 - p)        # P(X = 1)
  cdf <- term
  table <- cdf
  k <- 2
  repeat {
    if (term <= 1e-10) break
    term <- term * p * (k - 1) / k # P(X = k) from P(X = k - 1)
    cdf <- cdf + term
    table <- c(table, cdf)
    k <- k + 1
  }
  table
}
Моделирование логарифмического распределения по табличной функции распределения:
# rlog <- function(sum){
# x <- runif(1)
# j <- 1
#
# while (x > sum[j]){
# j <- j + 1
# }
#
# j
# }
# Draw one value from the distribution whose CDF table is `sum`
# (sum[j] = P(X <= j)) by inversion: the smallest j with sum[j] >= u
# for u ~ U(0, 1).
#
# Fix: the previous `which.min(sum < runif(1))` returned 1 whenever u
# exceeded every table entry (the table is truncated below 1), mapping
# rare tail draws to the SMALLEST value.  Such draws now map to the
# last table index instead.
rlog <- function(sum){
  u <- runif(1)
  match(TRUE, sum >= u, nomatch = length(sum))
}
Моделирование выборки логарифмического распределения:
# Draw a sample of size n from the tabulated logarithmic distribution.
# For n = 0 the scalar 0 is returned (kept for compatibility with
# callers that sum over the result).
rnlog <- function(n = 1, sum){
  if (n == 0) {
    return(0)
  }
  replicate(n, rlog(sum))
}
Смоделируем выборку и построим гистограмму:
# Draw 10000 logarithmic(p = 0.75) values and plot a histogram of the
# values below 9, with unit-wide bins centred on the integers.
v <- rnlog(10000, log_prepering(0.75))
hist(v[v < 9], probability = TRUE, breaks = seq(0.5, max(v) + 0.5, 1), xlim = c(0.5, 8.5), xlab = "Sample", main = "")
Заведём вероятность в нуле:
p_0 <- 0.2
Построим таблицу вероятностей для нашего распределения:
# CDF table of the zero-inflated logarithmic distribution:
# P(X = 0) = p_0, P(X = k) = (1 - p_0) * (-p^k) / (k * log(1 - p)).
# NOTE(review): reads the global `p_0`, which must exist at call time.
log_0_prepering <- function(p = 0.5){
  term <- (1 - p_0) * (-p) / log(1 - p) # (1 - p_0) * P(log X = 1)
  cdf <- p_0
  table <- c(p_0)
  k <- 2
  repeat {
    if (term <= 1e-10) break
    cdf <- cdf + term
    table <- c(table, cdf)
    term <- term * p * (k - 1) / k
    k <- k + 1
  }
  table
}
Функция моделирования одной логарифмической с нулём величины:
# Draw one value from the zero-inflated logarithmic distribution given
# its CDF table (`sum[j]` = P(X <= j - 1)).  The `p` argument is unused
# and kept only for interface compatibility.
rlog_0 <- function(p = 0.5, sum){
  u <- runif(1)
  idx <- 1
  repeat {
    if (u <= sum[idx]) break
    idx <- idx + 1
  }
  idx - 1
}
Моделирование нескольких случайных величин:
# Draw a sample of size n from the zero-inflated logarithmic
# distribution using its precomputed CDF table.
rnlog_0 <- function(n = 1, p = 0.5, sum){
  replicate(n, rlog_0(p, sum))
}
Смоделируем выборку:
# Build the zero-inflated CDF table for p = 0.75 (uses the global p_0),
# draw a 10000-point sample and plot a histogram of values below 9.
table_log <- log_0_prepering(0.75)
v <- rnlog_0(10000, 0.75, table_log)
hist(v[v < 9], probability = TRUE, breaks = seq(-0.5, max(v) + 0.5, 1), xlim = c(-0.5, 8.5), xlab = "Выборка", ylab = "Вероятность", main = "")
Моделирование биномиально-логарифмического распределения:
# Binomial-logarithmic compound: draw `num` binomial counts, then for
# each count draw that many logarithmic variables and sum them
# (0 when the count is 0).
rbinomlog <- function(num = 1, n = 1, p = 0.5, sumlog){
  counts <- rbinom(num, n, p)
  res <- c()
  for (cnt in counts){
    res <- c(res, sum(0, rnlog(cnt, sumlog)))
  }
  res
}
Получение чисел Стирлинга первого рода:
# Fill a table related to Stirling numbers of the first kind (per the
# surrounding text); the recurrence appears to store
# |s(i - 1, j - 1)| / (i - 1)! — TODO confirm.  Row 1 and column 1 must
# be pre-initialised by the caller; each row uses only the previous row.
MakeNumStir <- function(data, n){
  for (row in 2:n){
    for (col in 2:n){
      if (row == col){
        data[row, col] <- 1 / gamma(row)
      } else {
        data[row, col] <- data[row - 1, col - 1] / (row - 1) +
          data[row - 1, col] * (row - 2) / (row - 1)
      }
    }
  }
  data
}
# Precompute a 600 x 600 coefficient table for the binomial-logarithmic
# probabilities: first row and first column zeroed, the (1, 1) corner
# set to 1, then MakeNumStir fills the rest by its recurrence.
# NOTE(review): element-wise data.frame assignment in a loop is slow,
# but this runs only once.
nNumStir <- 600
NumStir <- as.data.frame(matrix(nrow = nNumStir, ncol = nNumStir))
for (i in 1:nNumStir){
NumStir[1, i] <- 0
NumStir[i, 1] <- 0
}
NumStir[1, 1] <- 1
NumStir <- MakeNumStir(NumStir, nNumStir)
Вероятности биномиально-логарифмического распределения:
# P(X = k) of the binomial-logarithmic distribution, built from the
# precomputed global table `NumStir`.  NaN / Inf / underflowed values
# are floored to 1e-309 so log(prob) stays finite in the likelihoods.
# Fix: return the already-computed `prob` instead of recomputing the
# same expression a second time.
get_prob_binomlog <- function(k, p = 0.5, n = 1, q = 0.5){
  res <- 0
  alpha <- -1 / log(1 - q)
  frac <- 1  # falling factorial n * (n - 1) * ... * (n - j + 1)
  for (j in 0:k){
    res <- res + frac * NumStir[k + 1, j + 1] * (p * alpha) ^j * (1 - p) ^(k - j)
    frac <- frac * (n - j)
  }
  prob <- (1 - p) ^(n - k) * q ^k * res
  if (is.nan(prob) || is.infinite(prob) || prob < 1e-309){
    return(1e-309)
  }
  prob
}
Функция правдоподобия для бином-логарифма:
# Negative log-likelihood of the binomial-logarithmic model (loop
# version, kept alongside the sapply version below).  Parameters
# outside the valid region yield +Inf via -log(0).
m_log_lik_binomlog_old <- function(x.in, p = 0.5, n = 1, q = 0.5){
  if (q <= 0 || q >= 1 || p <= 0 || p >= 1 || n <= 0){
    return(-log(0))
  }
  total <- 0
  for (obs in x.in){
    total <- total + log(get_prob_binomlog(obs, p, n, q))
  }
  -total
}
# Negative log-likelihood of the binomial-logarithmic model,
# vectorised over the observations with sapply.
m_log_lik_binomlog <- function(x.in, p = 0.5, n = 1, q = 0.5){
  if (q <= 0 || q >= 1 || p <= 0 || p >= 1 || n <= 0){
    return(-log(0))
  }
  probs <- sapply(x.in, function(obs) get_prob_binomlog(obs, p, n, q))
  -sum(log(probs))
}
Моделирование логарифмически-биномиального распределения:
# Logarithmic-binomial compound: draw `num` logarithmic counts, then
# for each count sum that many binomial draws (0 when the count is 0).
rlogbinom <- function(num = 1, sumlog, n = 1, p = 0.5){
  counts <- rnlog(num, sumlog)
  res <- c()
  for (cnt in counts){
    res <- c(res, sum(0, rbinom(cnt, n, p)))
  }
  res
}
Вероятность лог-бинома для \(k = 0, 1, 2, 3, 4\):
# Closed-form P(X = 0) of the logarithmic-binomial distribution.
get_prob_logbinom_logbinom_0 <- function(q = 0.5, p = 0.5, n = 1){
  m <- q * (1 - p) ^ n
  log(1 - m) / log(1 - q)
}
# Closed-form P(X = 1) of the logarithmic-binomial distribution.
get_prob_logbinom_logbinom_1 <- function(q = 0.5, p = 0.5, n = 1){
  m <- q * (1 - p) ^ n
  -q * p * n * (1 - p) ^ (n - 1) / ((1 - m) * log(1 - q))
}
# Closed-form P(X = 2) of the logarithmic-binomial distribution;
# m = q * (1 - p)^n is the common subexpression of the k = 0..4 terms.
get_prob_logbinom_logbinom_2 <- function(q = 0.5, p = 0.5, n = 1){
  m <- q * (1 - p) ^ n
  base <- -q * p ^ 2 * n * (1 - p) ^ (n - 2) / ((1 - m) * log(1 - q))
  (base / 2) * (n * m / (1 - m) + (n - 1))
}
# Closed-form P(X = 3) of the logarithmic-binomial distribution.
# Left verbatim: the algebra is intricate and order-sensitive.
get_prob_logbinom_logbinom_3 <- function(q = 0.5, p = 0.5, n = 1){
1 / 6 * (-q * p ^3 * n * (1 - p)^(n - 3) / ((1 - q* (1 - p) ^n) * log(1 - q))) *
(2 * q ^2 * n ^2 * (1 - p)^(2 * n) / ((1 - q* (1 - p) ^n) ^2) +
3 * q * n * (n - 1) * (1 - p)^(n) / ((1 - q* (1 - p) ^n)) + (n - 1) * (n - 2))
}
# Closed-form P(X = 4) of the logarithmic-binomial distribution.
# Left verbatim: the algebra is intricate and order-sensitive.
get_prob_logbinom_logbinom_4 <- function(q = 0.5, p = 0.5, n = 1){
1 / 24 * (-q * p ^4 * n * (1 - p)^(n - 4) / ((1 - q* (1 - p) ^n) * log(1 - q))) *
(3 * q ^3 * n ^3 * (1 - p)^(3 * n) / ((1 - q* (1 - p) ^n) ^3) +
12 * q ^2 * n ^2 * (n - 1) * (1 - p)^(2 * n) / ((1 - q* (1 - p) ^n) ^2) +
q * (1 - p)^(n) / ((1 - q* (1 - p) ^n)) * (3 * n * (n - 1) ^2 + 4 * n * (n - 1) * (n - 2)) +
(n - 1) * (n - 2) * (n - 3))
}
Вероятности логарифмически-биномиального распределения:
# P(X = k) of the logarithmic-binomial distribution: closed forms for
# k = 0..4, and for k >= 5 the remaining tail mass 1 - P(X <= 4).
# Returns NULL for any other k (e.g. negative), as before.
get_prob_logbinom <- function(k, q = 0.5, p = 0.5, n = 1){
  if (k >= 5){
    return(1 - sum(sapply(0:4, function(j) get_prob_logbinom(j, q, p, n))))
  }
  switch(as.character(k),
    "0" = get_prob_logbinom_logbinom_0(q, p, n),
    "1" = get_prob_logbinom_logbinom_1(q, p, n),
    "2" = get_prob_logbinom_logbinom_2(q, p, n),
    "3" = get_prob_logbinom_logbinom_3(q, p, n),
    "4" = get_prob_logbinom_logbinom_4(q, p, n))
}
Функция правдоподобия для лог-бинома:
# Negative log-likelihood of the logarithmic-binomial model; +Inf for
# parameters outside the valid region or probabilities outside [0, 1].
m_log_lik_logbinom <- function(x.in, q = 0.5, p = 0.5, n = 1){
  if (q <= 0 || q >= 1 || p <= 0 || p >= 1 || n <= 0){
    return(-log(0))
  }
  total <- 0
  for (obs in x.in){
    prob <- get_prob_logbinom(obs, q, p, n)
    if (prob < 0 || prob > 1 || is.nan(prob)){
      return(-log(0))
    }
    total <- total + log(prob)
  }
  -total
}
Моделирование логарифмически-пуассоновского распределения:
# Logarithmic-Poisson compound: draw `num` logarithmic counts, then
# for each count sum that many Poisson draws (0 when the count is 0).
rlogpois <- function(num = 1, sumlog, lambda = 1){
  counts <- rnlog(num, sumlog)
  res <- c()
  for (cnt in counts){
    res <- c(res, sum(0, rpois(cnt, lambda)))
  }
  res
}
Числа для ЛПР:
# Fill the coefficient table used by the logarithmic-Poisson
# probabilities via the recurrence
# t(i, j) = j * t(i - 1, j) + (i - j) * t(i - 1, j - 1), stored here
# scaled by 1 / i! (see the formula block below).  Row 1 and column 1
# must be pre-initialised; each row uses only the previous row.
MakeNumPois <- function(data, n){
  for (row in 2:n){
    for (col in 2:n){
      if (row == col + 1){
        data[row, col] <- 1 / gamma(row + 1)
      } else {
        data[row, col] <- col * data[row - 1, col] / row +
          (row - col) * data[row - 1, col - 1] / row
      }
    }
  }
  data
}
# Precompute the 600 x 600 coefficient table for the
# logarithmic-Poisson probabilities: column 1 holds 1 / i!
# (via gamma(i + 1)), row 1 is zero except the (1, 1) corner,
# then MakeNumPois fills the rest.
nNumPois <- 600
NumPois <- as.data.frame(matrix(nrow = nNumPois, ncol = nNumPois))
for (i in 1:nNumPois){
NumPois[i, 1] <- 1 / gamma(i + 1)
NumPois[1, i] <- 0
}
NumPois[1, 1] <- 1
NumPois <- MakeNumPois(NumPois, nNumPois)
Вероятности логарифмически-пуассоновского распределения:
# P(X = k) of the logarithmic-Poisson distribution.  k = 0 and k = 1
# have closed forms; k >= 2 uses the precomputed global table `NumPois`
# with m = q * exp(-lambda).
get_prob_logpois <- function(k, q = 0.5, lambda = 1){
  alpha <- -1 / log(1 - q)
  m <- q * exp(-lambda)
  if (k == 0){
    return(-alpha * log(1 - m))
  }
  if (k == 1){
    return(alpha * lambda * m / (1 - m))
  }
  acc <- 0
  for (j in 1:(k - 1)){
    acc <- acc + NumPois[k, j] * m ** j
  }
  acc * lambda ** k / (1 - m) ** k * alpha
}
# P(X = k) of the logarithmic-Poisson distribution via the direct
# mixture series sum_j q^j / k! * j^(k - 1) * exp(-j * lambda),
# truncated after 1000 terms or as soon as a term underflows to 0.
get_prob_logpois_fullprob <- function(k, q = 0.5, lambda = 1){
  alpha <- -1 / log(1 - q)
  acc <- 0
  for (j in 1:1000){
    term <- q ** j / gamma(k + 1) * j ** (k - 1) * exp(-j * lambda)
    if (term == 0){
      break
    }
    acc <- acc + term
  }
  acc * alpha * lambda ** k
}
\[ P(S _\tau = k) = \frac 1 {k !} \frac {\lambda ^k} {(1 - m) ^k} \sum \limits _{j = 1} ^{k - 1} t(k, j) m ^j, \quad k = 2, ... \]
\[ t(k, j) = j \cdot t (k - 1, j) + (k - j) \cdot t(k - 1, j - 1) \]
\[ \begin{aligned} \left(\frac {\lambda ^k} {(1 - m) ^k} \sum \limits _{j = 1} ^{k - 1} t(k, j) m ^j\right)' =& \frac {km\lambda ^{k + 1}} {(1 - m) ^{k + 1}} \sum \limits _{j = 1} ^{k - 1} t(k, j) m ^j + \frac {(1 - m)\lambda ^k} {(1 - m) ^{k + 1}} \sum \limits _{j = 1} ^{k - 1} t(k, j) \lambda j m ^j =\\ =& \frac {\lambda ^{k + 1}} {(1 - m) ^{k + 1}} \left(\sum \limits _{j = 1} ^{k - 1} t(k, j) k m ^{j + 1} + \sum \limits _{j = 1} ^{k - 1} t(k, j) j m ^j - \sum \limits _{j = 1} ^{k - 1} t(k, j) j m ^{j + 1}\right) =\\ =& \frac {\lambda ^{k + 1}} {(1 - m) ^{k + 1}} \left(\sum \limits _{j = 2} ^{k} \left(t(k, j) j +t(k, j - 1) (k - j + 1)\right) m ^j + t(k, 1) m\right) =\\ =& \frac {\lambda ^{k + 1}} {(1 - m) ^{k + 1}} \left(\sum \limits _{j = 2} ^{k} t(k + 1, j) m ^j + t(k + 1, 1) m\right) =\\ =& \frac {\lambda ^{k + 1}} {(1 - m) ^{k + 1}} \sum \limits _{j = 1} ^{k} t(k + 1, j) m ^j \end{aligned} \]
Вероятность лог-пуассона для \(k = 0, 1, 2, 3, 4\):
# get_prob_logpois_logpois_0 <- function(q = 0.5, lambda = 1){
# log(1 - q * exp(-lambda)) / log(1 - q)
# }
#
# get_prob_logpois_logpois_1 <- function(q = 0.5, lambda = 1){
# -q * lambda * exp(-lambda) / ((1 - q* exp(-lambda)) * log(1 - q))
# }
#
# get_prob_logpois_logpois_2 <- function(q = 0.5, lambda = 1){
# 1 / 2 * (-q * lambda ^ 2 * exp(-lambda) / ((1 - q* exp(-lambda)) ^ 2 * log(1 - q)))
# }
#
# get_prob_logpois_logpois_3 <- function(q = 0.5, lambda = 1){
# 1 / 6 * (q * exp(-lambda) + 1) * (-q * lambda ^ 3 * exp(-lambda) / ((1 - q* exp(-lambda)) ^ 3 * log(1 - q)))
# }
#
# get_prob_logpois_logpois_4 <- function(q = 0.5, lambda = 1){
# 1 / 24 * (q ** 2 * exp(-2 * lambda) + 4 * q * exp(-lambda) + 1) * (-q * lambda ^ 4 * exp(-lambda) / ((1 - q* exp(-lambda)) ^ 4 * log(1 - q)))
# }
Вероятности логарифмически-пуассоновского распределения:
# get_prob_logpois <- function(k, q = 0.5, lambda = 1){
# if (k == 0){
# get_prob_logpois_logpois_0(q, lambda)
# } else if (k == 1){
# get_prob_logpois_logpois_1(q, lambda)
# } else if (k == 2){
# get_prob_logpois_logpois_2(q, lambda)
# } else if (k == 3){
# get_prob_logpois_logpois_3(q, lambda)
# } else if (k == 4){
# get_prob_logpois_logpois_4(q, lambda)
# } else if (k >= 5){
# 1 - sum(sapply(0:4, function(k) get_prob_logpois(k, q, lambda)))
# }
# }
Гистограмма распределения:
# Sample the logarithmic-Poisson distribution (q = 0.25,
# lambda = 3.45), plot its histogram and overlay the theoretical
# probabilities.
samLPR <- rlogpois(10000, log_prepering(0.25), lambda = 3.45)
hist.default(samLPR, breaks = seq(-0.5, max(samLPR) + 0.5, 1), probability = TRUE)
points(x = 0:max(samLPR), y = sapply(0:max(samLPR), function(k) get_prob_logpois(k, 0.25, 3.45)))
# Cross-check the two implementations of the same probabilities:
# recurrence-based vs direct series.
probLBR1 <- sapply(0:max(samLPR), function(k) get_prob_logpois(k, 0.25, 3.45))
probLBR2 <- sapply(0:max(samLPR), function(k) get_prob_logpois_fullprob(k, 0.25, 3.45))
Функция правдоподобия для лог-пуассона:
# Negative log-likelihood of the logarithmic-Poisson model; +Inf for
# parameters outside the valid region or probabilities outside [0, 1].
m_log_lik_logpois <- function(x.in, q = 0.5, lambda = 1){
  if (q <= 0 || q >= 1 || lambda <= 0){
    return(-log(0))
  }
  total <- 0
  for (obs in x.in){
    prob <- get_prob_logpois(obs, q, lambda)
    if (prob < 0 || prob > 1 || is.nan(prob)){
      return(-log(0))
    }
    total <- total + log(prob)
  }
  -total
}
Функция правдоподобия негативного бинома:
# Negative log-likelihood of the negative-binomial model with size n
# and success probability q; +Inf outside the valid parameter region.
m_log_lik_nbinom <- function(x.in, q = 0.5, n = 1){
  if (q <= 0 || q >= 1 || n <= 0){
    return(-log(0))
  }
  total <- 0
  for (obs in x.in){
    prob <- dnbinom(obs, n, q)
    if (prob < 0 || prob > 1 || is.nan(prob)){
      return(-log(0))
    }
    total <- total + log(prob)
  }
  -total
}
Моделирование свёртки отрицательно-биномиальных распределений:
# Convolution sample: element-wise sum of two independent
# negative-binomial draws.  The two rnbinom calls keep their original
# order so RNG streams match the previous implementation.
rdoublenbinom <- function(num = 1, p_1 = 0.5, size_1= 1, p_2 = 0.5, size_2 = 1){
  first <- rnbinom(num, size_1, p_1)
  second <- rnbinom(num, size_2, p_2)
  first + second
}
Вероятности свёртки негативных биномов:
# P(X = k) for the convolution of two negative binomials, as the
# discrete convolution sum over m = 0..k (vectorised over m).
get_prob_doublenbinom <- function(k, p_1 = 0.5, size_1= 1, p_2 = 0.5, size_2 = 1){
  m <- 0:k
  terms <- choose(m + size_1 - 1, m) * choose(k - m + size_2 - 1, k - m) *
    (1 - p_1) ^ m * (1 - p_2) ^ (k - m)
  sum(terms) * p_1 ^ size_1 * p_2 ^ size_2
}
Функция правдоподобия свёртки негативных биномов:
# Negative log-likelihood for the convolution of two negative
# binomials; +Inf outside the valid region.
# Fix: the scalar guard used element-wise `|`; replaced with the
# short-circuit `||` used by every other likelihood in this file.
# NOTE(review): the bounds here are inclusive (p may equal 0 or 1),
# unlike the strict bounds elsewhere — preserved as-is.
m_log_lik_doublenbinom <- function(x.in, p_1 = 0.5, size_1= 1, p_2 = 0.5, size_2 = 1){
  if (p_1 < 0 || p_1 > 1 || p_2 < 0 || p_2 > 1 || size_1 <= 0 || size_2 <= 0)
    return(-log(0))
  res <- 0
  for (i in x.in){
    prob <- get_prob_doublenbinom(i, p_1, size_1, p_2, size_2)
    if (prob < 0 || prob > 1 || is.nan(prob))
      return(-log(0))
    res <- res + log(prob)
  }
  -res
}
# Negative log-likelihood of the Poisson model; +Inf for lambda < 0.
m_log_lik_pois <- function(x.in, lambda = 1){
  if (lambda < 0){
    return(-log(0))
  }
  total <- 0
  for (obs in x.in){
    prob <- dpois(obs, lambda = lambda)
    if (prob < 0 || prob > 1 || is.nan(prob)){
      return(-log(0))
    }
    total <- total + log(prob)
  }
  -total
}
Создание выборки по частотам значений случайной величины:
# Expand a frequency table into a sample: num_k[i] copies of the value
# i - 1 (loop version, kept alongside the vectorised generate_sample).
# Fix: iterate with seq_along so a zero-length input yields NULL
# instead of the out-of-bounds access produced by 1:length(num_k).
generate_sample_old <- function(num_k){
  sam <- c()
  for (idx in seq_along(num_k)){
    sam <- c(sam, rep.int(idx - 1, num_k[idx]))
  }
  sam
}
# Expand a frequency table into a sample: num_k[i] copies of the value
# i - 1, vectorised via rep.int.
# Fix: use seq_along so a zero-length input returns integer(0); the
# old 0:(length(num_k) - 1) produced c(0, -1) for empty input.
generate_sample <- function(num_k){
  rep.int(seq_along(num_k) - 1L, num_k)
}
Статистика критерия \(\chi ^2\):
# Chi-square statistic computed with an explicit accumulation loop
# (kept alongside the vectorised my_chisq).
# Fix: iterate with seq_along so a zero-length `prob` gives 0 instead
# of the NA produced by indexing via 1:length(prob).
my_chisq_old <- function(exp_prob, prob){
  total <- 0
  for (idx in seq_along(prob)){
    total <- total + (exp_prob[idx] - prob[idx])^2 / prob[idx]
  }
  total
}
# Vectorised chi-square statistic:
# sum over bins of (observed - expected)^2 / expected.
my_chisq <- function(exp_prob, prob){
  sq_dev <- (exp_prob - prob)^2
  sum(sq_dev / prob)
}
Проверка гипотезы о соответствии теоретического распределения эмпирическому:
```{r}
Error: attempt to use zero-length variable name
Функция правдоподобия для ЛБР и БЛР:
# Draw the negative log-likelihood surface over a 2-D parameter grid
# as an rgl point cloud for one of the mixed models ('binomlog',
# 'logbinom' or 'logpois').  `data` is a vector of value frequencies;
# inter_par1/inter_par2 are c(lower, upper, number of steps).
# Side effects: opens an rgl window and draws axes plus points.
funcMP_3D <- function(data, distr = 'binomlog', inter_par1 = c(0.01, 0.99, 100), inter_par2 = c(1, 100, 10)){
library(rgl)
open3d()
# gray coordinate axes of length 12
lines3d(c(0, 0), c(0, 0), c(0, 12), color = "gray");
lines3d(c(0, 12), c(0, 0), c(0, 0), color = "gray");
lines3d(c(0, 0), c(0, 12), c(0, 0), color = "gray");
sam <- generate_sample(data)
N <- length(sam)
# biased (ML) variance and mean of the expanded sample; presumably
# used to tie the third model parameter to the grid via moments —
# TODO confirm the moment equations below
var_sam = var(sam) * (N - 1) / N
mean_sam = mean(sam)
x <- c()
y <- c()
z <- c()
for (par1 in seq(inter_par1[1], inter_par1[2], (inter_par1[2] - inter_par1[1]) / inter_par1[3])){
for (par2 in seq(inter_par2[1], inter_par2[2], (inter_par2[2] - inter_par2[1]) / inter_par2[3])){
# NOTE(review): `value` is left over from the previous iteration if
# `distr` matches none of the three branches
if (distr == 'binomlog'){
value <- m_log_lik_binomlog(sam, p = (var_sam / mean_sam * log(1 - par1) * (1 - par1) - log(1 - par1)) / par1, n = par2, q = par1)
} else if (distr == 'logbinom'){
value <- m_log_lik_logbinom(sam, q = par1, p = - log(1 - par1) * (1 - par1) / par2 / par1 * mean_sam, n = par2)
} else if (distr == 'logpois'){
value <- m_log_lik_logpois(sam, q = par1, lambda = par2)
}
if (value != 0) {
x <- c(x, par1)
y <- c(y, par2)
z <- c(z, value)
}
}
}
# shift so the minimum sits at height 0; clip points above height 12
z <- z - min(z)
if (distr == 'binomlog'){
z <- sqrt(z) / 10
points3d(10 * x[z < 12], y[z < 12] * 10 / inter_par2[2], z[z < 12], color ="black")
} else if (distr == 'logbinom'){
points3d(10 * x[z < 12], y[z < 12] * 10 / inter_par2[2], z[z < 12], color ="black")
} else if (distr == 'logpois'){
points3d(10 * x[z < 12], y[z < 12] * 10 / inter_par2[2], z[z < 12], color ="black")
}
}
Проверка оценки для ЛБР:
samLBR <- rlogbinom(1000, log_prepering(0.4), 6, 0.3)
samLBR <- sapply(samLBR, function(x) if (x > 4){5} else {x})
histLBR <- hist(as.numeric(samLBR), probability = TRUE, breaks = seq(-0.5, max(samLBR) + 0.5, 1), xlim = c(-0.5, max(samLBR) + 0.5), plot = TRUE)
res <- hist_make(0, as.numeric(histLBR$counts), get_hist = TRUE, 'logbinom')
# funcMP_3D(histLBR$counts, distr = 'logbinom')
res
[1] 5.0000000 0.4352014 0.3511828 0.7987618
q_LBR <- c()
n_LBR <- c()
p_LBR <- c()
seq_LBR <- seq(100, 10000, 500)
samLBR <- rlogbinom(10000, log_prepering(0.4), 6, 0.3)
samLBR <- sapply(samLBR, function(x) if (x > 4){5} else {x})
for (i in seq_LBR){
histLBR <- hist(as.numeric(samLBR[1:i]), breaks = seq(-0.5, max(samLBR) + 0.5, 1), plot = FALSE)
res <- hist_make(0, as.numeric(histLBR$counts), get_hist = FALSE, 'logbinom')
n_LBR <- c(n_LBR, res[1])
q_LBR <- c(q_LBR, res[2])
p_LBR <- c(p_LBR, res[3])
}
Проверка оценки для БЛР:
q_BLR <- c()
n_BLR <- c()
p_BLR <- c()
seq_BLR <- seq(100, 10000, 500)
samBLR <- rbinomlog(10000, 6, 0.3, log_prepering(0.4))
for (i in seq_BLR){
histBLR <- hist(as.numeric(samBLR[1:i]), breaks = seq(-0.5, max(samBLR) + 0.5, 1), plot = FALSE)
res <- hist_make(0, as.numeric(histBLR$counts), get_hist = FALSE, 'binomlog')
n_BLR <- c(n_BLR, res[1])
q_BLR <- c(q_BLR, res[2])
p_BLR <- c(p_BLR, res[3])
print(i)
}
[1] 100
[1] 600
[1] 1100
[1] 1600
[1] 2100
[1] 2600
[1] 3100
[1] 3600
[1] 4100
[1] 4600
[1] 5100
[1] 5600
[1] 6100
[1] 6600
[1] 7100
[1] 7600
[1] 8100
[1] 8600
[1] 9100
[1] 9600
par(mfrow = c(1, 3))
plot(x = seq_BLR, y = q_BLR, xlab = "Размер выборки", ylab = "Оценка параметра q логарифма")
abline(h = 0.4, lty = "dashed")
legend(x = "bottomright", legend = "Истинное значение", lty = 2, col = 1, lwd = 2)
plot(x = seq_BLR, y = n_BLR, xlab = "Размер выборки", ylab = "Оценка параметра n бинома")
abline(h = 6, lty = "dashed")
legend(x = "topright", legend = "Истинное значение", lty = 2, col = 1, lwd = 2)
plot(x = seq_BLR, y = p_BLR, xlab = "Размер выборки", ylab = "Оценка параметра p бинома")
abline(h = 0.3, lty = "dashed")
legend(x = "bottomright", legend = "Истинное значение", lty = 2, col = 1, lwd = 2)
par(mfrow = c(1, 1))
Проверка оценки для ЛПР:
# funcMP_3D(histLPR$counts, distr = 'logpois', inter_par1 = c(0.01, 0.99, 100), inter_par2 = c(3.1, 4.1, 10))
res
[1] 3.5960192 0.2615519 0.1376612
Состоятельность:
q_LPR <- c()
lambda_LPR <- c()
samLPR <- rlogpois(10000, log_prepering(0.25), 3.45)
for (i in seq(100, 10000, 100)){
histLPR <- hist(as.numeric(samLPR[1:i]), breaks = seq(-0.5, max(samLPR) + 0.5, 1), plot = FALSE)
res <- hist_make(0, as.numeric(histLPR$counts), get_hist = FALSE, 'logpois')
lambda_LPR <- c(lambda_LPR, res[1])
q_LPR <- c(q_LPR, res[2])
}
plot(x = seq(100, 10000, 100), y = q_LPR, xlab = "Размер выборки", ylab = "Оценка параметра логарифма")
abline(h = 0.25, lty = "dashed")
legend(x = "bottomright", legend = "Истинное значение", lty = 2, col = 1, lwd = 2)
plot(x = seq(100, 10000, 100), y = lambda_LPR, xlab = "Размер выборки", ylab = "Оценка параметра Пуассона")
abline(h = 3.45, lty = "dashed")
legend(x = "bottomright", legend = "Истинное значение", lty = 2, col = 1, lwd = 2)
samNBR <- rnbinom(100, 3, 0.4)
samNBR <- sapply(samNBR, function(x) if (x > 4){5} else {x})
histLPR <- hist(as.numeric(samNBR), probability = TRUE, breaks = seq(-0.5, max(samNBR) + 0.5, 1), xlim = c(-0.5, max(samNBR) + 0.5), plot = TRUE)
res <- hist_make(0, as.numeric(histLPR$counts), get_hist = TRUE, 'nbinom')
funcMP_3D(histLPR$counts, distr = '', inter_par1 = c(0.01, 0.99, 100), inter_par2 = c(0.01, 2.99, 100))
res
Загрузка радиобиологических данных:
m <- read.csv("./VitroVivo.csv")
df.BLR <- data.frame(n = rep(0, 19), q = rep(0, 19), p = rep(0, 19), p_value = rep(0, 19))
df.LBR <- data.frame(n = rep(0, 19), q = rep(0, 19), p = rep(0, 19), p_value = rep(0, 19))
df.LPR <- data.frame(lambda = rep(0, 19), q = rep(0, 19), p_value = rep(0, 19))
df.NBR <- data.frame(n = rep(0, 19), q = rep(0, 19), p_value = rep(0, 19))
Рассеяние:
e.df <- data.frame(e = sapply(1:19, function(i) e_culc(generate_sample(m[i, ]))))
e.df
Нахождение \(n, q, p\) для которых p-value максимально:
for (i in 1:19){
print(i)
df.BLR[i, ] <- hist_make(0, as.numeric(m[i,]), get_hist = FALSE, 'binomlog')
# df.LBR[i, ] <- hist_make(0, as.numeric(m[i,]), get_hist = FALSE, 'logbinom')
# df.LPR[i, ] <- hist_make(0, as.numeric(m[i,]), get_hist = FALSE, 'logpois')
# df.NBR[i, ] <- hist_make(0, as.numeric(m[i,]), get_hist = FALSE, 'nbinom')
# df.PR[i, ] <- hist_make(0, as.numeric(m[i,]), get_hist = FALSE, 'pois')
}
[1] 1
Error in optim(c(2, 1/nLeft), function(x) m_log_lik_binomlog(sam, p = x[2], :
function cannot be evaluated at initial parameters
Найденные оценки и p-value:
df.BLR
df.LPR <- df.LPR |> mutate(mean_log = -q / log(1 - q) / (1 - q))
df.LPR
df.NBR <- df.NBR |> mutate(lambda = -n / log(1 - q))
df.NBR
Гистограммы для строки данных:
Функция правдоподобия:
funcMP_3D(as.numeric(m[11,]), distr = 'logpois', inter_par1 = c(0.01, 0.99, 100), inter_par2 = c(0.01, 2.99, 100))
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Проверка на стабильность модели:
# Model stability check: for data rows 11..19, refit with repetition
# index i = 1..50 and plot the p-value traces on a blank canvas.
plot(x = 1:50, y = seq(0, 1, length.out = 50), col = "white")
for (j in 11:19){
n <- 1:50
res <- c()
for (i in n){
# NOTE(review): j only ranges over 11:19 here, so this branch is
# dead — presumably left over from a run over all rows 1:19
if (j <= 10){
res <- c(res, hist_make(i, as.numeric(m[j,]), get_hist = FALSE, 'binomlog')[4])
}
else{
res <- c(res, hist_make(i, as.numeric(m[j,]), get_hist = FALSE, 'logbinom')[4])
}
}
lines(x = n, y = res, col = j)
}
Корреляция с радиацией параметров и среднего:
plot(seq(0, 45, 5), df.BLR$p[1:10], type = "l", col = "blue", ylim = c(0, max(df.BLR$p)), ylab = "Parametr p", xlab = "Dose, Gy", main = "Black - in vitro; Blue - in vivo")
lines(seq(0, 40, 5), df.BLR$p[11:19], type = "b", col = "black")
plot(seq(0, 45, 5), df.BLR$q[1:10], type = "l", col = "blue", ylim = c(0, max(df.BLR$q)), ylab = "Parametr q", xlab = "Dose, Gy", main = "Black - in vitro; Blue - in vivo")
lines(seq(0, 40, 5), df.BLR$q[11:19], type = "b", col = "black")
plot(seq(0, 45, 5), df.BLR$n[1:10], type = "l", ylim = c(0, max(df.BLR)), ylab = "Параметр n", xlab = "Доза облучения, Гр")
lines(seq(0, 40, 5), df.BLR$n[11:19], lty = 2)
legend(x = "bottomright", legend = c("in vivo", "in vitro"), lty = c(1, 2), col = 1, lwd = 2)
plot(seq(0, 45, 5), df.LBR$p[1:10], type = "l", col = "blue", ylim = c(0, max(df.LBR$p)), ylab = "Parametr p", xlab = "Dose, Gy", main = "Black - in vitro; Blue - in vivo")
lines(seq(0, 40, 5), df.LBR$p[11:19], type = "b", col = "black")
plot(seq(0, 45, 5), df.LBR$q[1:10], type = "l", col = "blue", ylim = c(0, max(df.LBR$q)), ylab = "Parametr q", xlab = "Dose, Gy", main = "Black - in vitro; Blue - in vivo")
lines(seq(0, 40, 5), df.LBR$q[11:19], type = "b", col = "black")
plot(seq(0, 45, 5), df.LBR$n[1:10], type = "l", col = "blue", ylim = c(0, max(df.LBR)), ylab = "Parametr n", xlab = "Dose, Gy", main = "Black - in vitro; Blue - in vivo")
lines(seq(0, 40, 5), df.LBR$n[11:19], type = "b", col = "black")
plot(seq(0, 45, 5), df.LPR$mean_log[1:10], type = "l", ylim = c(0.9, max(c(df.LPR$mean_log[1:10], df.LPR$mean_log[11:19]))), ylab = "Среднее логарифмического", xlab = "Доза облучения, Гр")
lines(seq(0, 40, 5), df.LPR$mean_log[11:19], lty = 2)
legend(x = "topright", legend = c("in vivo", "in vitro"), lty = c(1, 2), col = 1, lwd = 2)
plot(seq(0, 45, 5), df.LPR$lambda[1:10], type = "l", ylim = c(0, max(df.LPR$lambda)), ylab = "Среднее пуассоновского", xlab = "Доза облучения, Гр")
lines(seq(0, 40, 5), df.LPR$lambda[11:19], lty = 2)
legend(x = "bottomright", legend = c("in vivo", "in vitro"), lty = c(1, 2), col = 1, lwd = 2)
# Dose-response plots for the negative-binomial fit.
# Fix: `df.NB` is not defined anywhere in this file; the fitted
# parameter table is `df.NBR` (constructed and mutated above).
plot(seq(0, 45, 5), df.NBR$q[1:10], type = "l", col = "blue", ylim = c(0, max(df.NBR$q)), ylab = "Parametr q", xlab = "Dose, Gy", main = "Black - in vitro; Blue - in vivo")
lines(seq(0, 40, 5), df.NBR$q[11:19], type = "b", col = "black")
plot(seq(0, 45, 5), df.NBR$n[1:10], type = "l", col = "blue", ylim = c(0, max(df.NBR$n)), ylab = "Parametr n", xlab = "Dose, Gy", main = "Black - in vitro; Blue - in vivo")
lines(seq(0, 40, 5), df.NBR$n[11:19], type = "b", col = "black")
plot(seq(0, 45, 5), df.NBR$lambda[1:10], type = "l", col = "blue", ylim = c(0, max(df.NBR$lambda[1:10])), ylab = "Parametr lambda", xlab = "Dose, Gy", main = "Black - in vitro; Blue - in vivo")
lines(seq(0, 40, 5), df.NBR$lambda[11:19], type = "b", col = "black")
Загрузим данные:
Рассчитаем параметры для первых \(1000\) слов:
df.word.nbinom <- data.frame()
start_time <- Sys.time()
cl <- makeCluster(5, type = "SOCK")
registerDoSNOW(cl)
df.word.nbinom <- foreach(i = 1:num.word, .combine = rbind, .inorder = TRUE) %dopar% {
histWord <- hist(as.numeric(WordSample[i, ]), probability = TRUE, breaks = seq(-0.5, max(WordSample[i, ]) + 0.5, 1), xlim = c(-0.5, max(WordSample[i, ]) + 0.5), plot = FALSE)
res <- hist_make(0, histWord$counts, get_hist = FALSE, distr = 'nbinom')
c(row.names(WordSample)[i], res)
}
colnames(df.word.nbinom) <- c("name", "n", "q", "p_value")
df.word.nbinom <- as.data.frame(df.word.nbinom)
df.word.nbinom <- df.word.nbinom |> mutate(n = as.numeric(n), q = as.numeric(q), p_value = as.numeric(p_value))
timediff <- difftime(Sys.time(),start_time)
cat("Расчёт занял: ", timediff, units(timediff))
Точечный график параметров для полученных слов:
df.word.nbinom |> filter(n < 20) |> mutate(p_value_group = as.factor(ifelse(p_value < 0.1, 1, 2))) |>
ggplot(aes(x = q, y = n, color = p_value_group)) +
geom_point()
Рассчитаем параметры для первых \(1000\) слов:
df.word.binlog <- data.frame()
start_time <- Sys.time()
cl <- makeCluster(5, type = "SOCK")
registerDoSNOW(cl)
df.word.binlog <- foreach(i = 1:num.word, .combine = rbind, .inorder = TRUE) %dopar% {
print(i)
histWord <- hist(as.numeric(WordSample[i, ]), probability = TRUE, breaks = seq(-0.5, max(WordSample[i, ]) + 0.5, 1), xlim = c(-0.5, max(WordSample[i, ]) + 0.5), plot = FALSE)
res <- hist_make(0, histWord$counts, get_hist = FALSE, distr = 'binomlog')
c(row.names(WordSample)[i], res)
}
colnames(df.word.binlog) <- c("name", "n", "q", "p", "p_value")
df.word.binlog <- as.data.frame(df.word.binlog)
df.word.binlog <- df.word.binlog |> mutate(n = as.numeric(n), q = as.numeric(q), p = as.numeric(p), p_value = as.numeric(p_value))
timediff <- difftime(Sys.time(),start_time)
cat("Расчёт занял: ", timediff, units(timediff))
Расчёт занял: 1.572357 hours
Проверка оценки для свёртки отрицательных биномов:
samDNB <- rdoublenbinom(1000, 0.8, 3, 0.5, 10)
histDNB <- hist(as.numeric(samDNB), probability = TRUE, breaks = seq(-0.5, max(samDNB) + 0.5, 1), xlim = c(-0.5, max(samDNB) + 0.5), plot = TRUE)
res <- hist_make(0, as.numeric(histDNB$counts), get_hist = TRUE, 'doublenbinom')
res
Рассчитаем параметры для первых \(1000\) слов:
df.word.doublenbinom <- data.frame()
start_time <- Sys.time()
cl <- makeCluster(5, type = "SOCK")
registerDoSNOW(cl)
df.word.doublenbinom <- foreach(i = 1:num.word, .combine = rbind, .inorder = TRUE) %dopar% {
histWord <- hist(as.numeric(WordSample[i, ]), probability = TRUE, breaks = seq(-0.5, max(WordSample[i, ]) + 0.5, 1), xlim = c(-0.5, max(WordSample[i, ]) + 0.5), plot = FALSE)
res <- hist_make(0, histWord$counts, get_hist = FALSE, distr = 'doublenbinom')
c(row.names(WordSample)[i], res)
}
colnames(df.word.doublenbinom) <- c("name", "p1", "size1", "p2", "size2", "p_value")
df.word.doublenbinom <- as.data.frame(df.word.doublenbinom)
df.word.doublenbinom <- df.word.doublenbinom |> mutate(p1 = as.numeric(p1), size1 = as.numeric(size1), p2 = as.numeric(p2), size2 = as.numeric(size2), p_value = as.numeric(p_value))
timediff <- difftime(Sys.time(),start_time)
cat("Расчёт занял: ", timediff, units(timediff))
Рассчитаем параметры для первых \(1000\) слов:
df.word.logpois <- data.frame()
start_time <- Sys.time()
cl <- makeCluster(5, type = "SOCK")
registerDoSNOW(cl)
df.word.logpois <- foreach(i = 1:num.word, .combine = rbind, .inorder = TRUE) %dopar% {
histWord <- hist(as.numeric(WordSample[i, ]), probability = TRUE, breaks = seq(-0.5, max(WordSample[i, ]) + 0.5, 1), xlim = c(-0.5, max(WordSample[i, ]) + 0.5), plot = FALSE)
res <- hist_make(0, histWord$counts, get_hist = FALSE, distr = 'logpois')
c(row.names(WordSample)[i], res)
}
colnames(df.word.logpois) <- c("name", "lambda", "q", "p_value")
df.word.logpois <- as.data.frame(df.word.logpois)
df.word.logpois <- df.word.logpois |> mutate(lambda = as.numeric(lambda), q = as.numeric(q), p_value = as.numeric(p_value))
timediff <- difftime(Sys.time(),start_time)
cat("Расчёт занял: ", timediff, units(timediff))
Расчёт занял: 2.395301 mins
Моделирование отрицательного бинома с параметром prob, распределённом по логарифму:
# Negative binomial whose success probability exp(-X) is driven by a
# logarithmic mixing variable X with parameter ro.
r_yule_simon_nbinom <- function(n, ro, size){
  res <- c()
  for (mix in rnlog(n, log_prepering(ro))){
    res <- c(res, rnbinom(1, size, exp(-mix)))
  }
  res
}
Вероятности этого распределения и гистограмма:
# P(X = k) of the logarithmically mixed negative binomial: component i
# has density dnbinom(k, n, exp(-i)) and weight -q^i / (i * log(1 - q)).
# The series is truncated at max_num terms, or earlier once a
# component's density underflows to 0.
get_prob_yule_simon_nbinom <- function(k, q = 0.5, n = 1, max_num = 1000){
  acc <- 0
  for (i in seq_len(max_num)){
    dens <- dnbinom(k, n, exp(-i))
    if (is.nan(dens) || dens == 0){
      break
    }
    acc <- acc + dens * (-q ** i / log(1 - q) / i)
  }
  acc
}
yule_simon.q <- 0.707
yule_simon.n <- 20.99
samp.yule_simon_nbinom <- r_yule_simon_nbinom(10000, yule_simon.q, yule_simon.n)
samp.yule_simon_nbinom <- samp.yule_simon_nbinom[samp.yule_simon_nbinom < 200]
hist.default(samp.yule_simon_nbinom, breaks = seq(-0.5, max(samp.yule_simon_nbinom) + 0.5, 1), probability = TRUE)
points(x = 0:200, y = sapply(0:200, function(n) get_prob_yule_simon_nbinom(n, yule_simon.q, yule_simon.n)))
Функция максимального правдоподобия:
# Negative log-likelihood of the logarithmically mixed negative
# binomial; +Inf outside the valid parameter region.
m_log_lik_yule_simon_nbinom <- function(x.in, q = 0.5, n = 1){
  if (q <= 0 || q >= 1 || n <= 0){
    return(-log(0))
  }
  total <- 0
  for (obs in x.in){
    prob <- get_prob_yule_simon_nbinom(obs, q, n)
    if (prob < 0 || prob > 1 || is.nan(prob)){
      return(-log(0))
    }
    total <- total + log(prob)
  }
  -total
}
Рассчитаем параметры для первых \(1000\) слов:
df.word.yulesimonnbinom <- data.frame()
start_time <- Sys.time()
cl <- makeCluster(5, type = "SOCK")
registerDoSNOW(cl)
df.word.yulesimonnbinom <- foreach(i = 1:num.word, .combine = rbind, .inorder = TRUE) %dopar% {
histWord <- hist(as.numeric(WordSample[i, ]), probability = TRUE, breaks = seq(-0.5, max(WordSample[i, ]) + 0.5, 1), xlim = c(-0.5, max(WordSample[i, ]) + 0.5), plot = FALSE)
res <- hist_make(0, histWord$counts, get_hist = FALSE, distr = 'yulesimonnbinom')
c(row.names(WordSample)[i], res)
}
colnames(df.word.yulesimonnbinom) <- c("name", "q", "n", "p_value")
df.word.yulesimonnbinom <- as.data.frame(df.word.yulesimonnbinom)
df.word.yulesimonnbinom <- df.word.yulesimonnbinom |> mutate(q = as.numeric(q), n = as.numeric(n), p_value = as.numeric(p_value))
timediff <- difftime(Sys.time(),start_time)
cat("Расчёт занял: ", timediff, units(timediff))
df.word.yulesimonnbinom <- df.word.yulesimonnbinom |> mutate(p_value = ifelse(p_value == 1, 0, p_value))
Гистограммы слова по всем распределениям:
Сведём результаты в одну таблицу:
df.word.all <- cbind(df.word.nbinom, select(df.word.binlog, -name), select(df.word.doublenbinom, -name), freq = apply(WordSample, 1, sum)[1:1000])
colnames(df.word.all) <- c("name", "n_nbinom", "q_nbinom", "pvalue_nbinom", "n_binomlog", "q_binomlog", "p_binomlog", "pvalue_binomlog", "p1_doublenbinom", "size1_doublenbinom", "p2_doublenbinom", "size2_doublenbinom", "pvalue_doublenbinom", "freq")
Зададим уровень значимости:
alpha <- 0.05
Для каких слов все распределения хороши:
df.word.all |> select(name, pvalue_nbinom, pvalue_binomlog, pvalue_doublenbinom) |> filter(pvalue_nbinom > alpha & pvalue_binomlog > alpha & pvalue_doublenbinom > alpha)
Для каких слов лучше свёртка двух биномов:
df.word.all |> select(name, pvalue_nbinom, pvalue_binomlog, pvalue_doublenbinom) |> filter(pvalue_doublenbinom > alpha & pvalue_binomlog < alpha & pvalue_nbinom < alpha)
Для каких слов БЛР лучше:
df.word.all |> select(name, pvalue_nbinom, pvalue_binomlog, pvalue_doublenbinom) |> filter(pvalue_binomlog > alpha & pvalue_doublenbinom < alpha & pvalue_nbinom < alpha)
Для каких слов лучше негативный бином:
df.word.all |> select(name, pvalue_nbinom, pvalue_binomlog, pvalue_doublenbinom) |> filter(pvalue_nbinom > alpha & pvalue_doublenbinom < alpha & pvalue_binomlog < alpha)
Для каких слов лучше отрицательный бином и БЛР, но не свёртка:
df.word.all |> select(name, pvalue_nbinom, pvalue_binomlog, pvalue_doublenbinom) |> filter(pvalue_doublenbinom < alpha & pvalue_nbinom > alpha & pvalue_binomlog > alpha)
Для каких слов лучше отрицательный бином и свёртка, но не БЛР:
df.word.all |> select(name, pvalue_nbinom, pvalue_binomlog, pvalue_doublenbinom) |> filter(pvalue_doublenbinom > alpha & pvalue_nbinom > alpha & pvalue_binomlog < alpha)
Для каких слов лучше БЛР и свёртка, но не отрицательный бином:
df.word.all |> select(name, pvalue_nbinom, pvalue_binomlog, pvalue_doublenbinom) |> filter(pvalue_doublenbinom > alpha & pvalue_nbinom < alpha & pvalue_binomlog > alpha)
Для каких слов все плохи:
df.word.all |> select(name, pvalue_nbinom, pvalue_binomlog, pvalue_doublenbinom) |> filter(pvalue_nbinom < alpha & pvalue_doublenbinom < alpha & pvalue_binomlog < alpha)
Отдельные классы для слов:
df.word.all <- df.word.all |>
mutate(class = ifelse(pvalue_doublenbinom > alpha & pvalue_nbinom > alpha & pvalue_binomlog > alpha, "Все", "-")) |>
mutate(class = ifelse(pvalue_doublenbinom > alpha & pvalue_nbinom < alpha & pvalue_binomlog < alpha, "Сумма", class)) |>
mutate(class = ifelse(pvalue_doublenbinom < alpha & pvalue_nbinom < alpha & pvalue_binomlog > alpha, "БЛР", class)) |>
mutate(class = ifelse(pvalue_doublenbinom < alpha & pvalue_nbinom > alpha & pvalue_binomlog < alpha, "ОБР", class)) |>
mutate(class = ifelse(pvalue_doublenbinom > alpha & pvalue_nbinom > alpha & pvalue_binomlog < alpha, "ОБР и сумма", class)) |>
mutate(class = ifelse(pvalue_doublenbinom < alpha & pvalue_nbinom > alpha & pvalue_binomlog > alpha, "ОБР и БЛР", class)) |>
mutate(class = ifelse(pvalue_doublenbinom > alpha & pvalue_nbinom < alpha & pvalue_binomlog > alpha, "БЛР и сумма", class)) |>
mutate(class = ifelse(pvalue_doublenbinom < alpha & pvalue_nbinom < alpha & pvalue_binomlog < alpha, "Ни один", class))
Посмотрим на точечный график:
df.word.all |> plot_ly(x = ~log(n_nbinom), y = ~q_nbinom, z = ~log(freq), color = ~class, text = ~paste('word:', name), size = I(150), alpha = 1, width = 1200, height = 700) |>
layout(scene = list(xaxis = list(title = 'Логарифм параметра n негативного бинома'), yaxis = list(title = 'Параметр q негативного бинома'), zaxis = list(title = 'Логарифм количества')))
No trace type specified:
Based on info supplied, a 'scatter3d' trace seems appropriate.
Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
No scatter3d mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
No trace type specified:
Based on info supplied, a 'scatter3d' trace seems appropriate.
Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
No scatter3d mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode